<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
<meta name="generator" content="Hexo 5.4.0">
  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png">
  <link rel="mask-icon" href="/images/logo.svg" color="#222">

<link rel="stylesheet" href="/css/main.css">


<link rel="stylesheet" href="/lib/font-awesome/css/all.min.css">

<script id="hexo-configurations">
    // Theme namespace: reuse window.NexT when it already exists, otherwise start empty.
    var NexT = window.NexT || {};
    // Generated site/theme settings. Inline scripts further down this page read
    // CONFIG.comments and attach CONFIG.page; other consumers are presumably the
    // theme scripts loaded at the end of <body> (not visible here).
    var CONFIG = {
      hostname: "example.com",
      root: "/",
      scheme: "Pisces",
      version: "7.8.0",
      exturl: false,
      sidebar: {
        position: "left",
        display: "post",
        padding: 18,
        offset: 12,
        onmobile: false
      },
      copycode: {
        enable: false,
        show_result: false,
        style: null
      },
      back2top: {
        enable: true,
        sidebar: false,
        scrollpercent: true
      },
      bookmark: {
        enable: false,
        color: "#222",
        save: "auto"
      },
      fancybox: false,
      mediumzoom: false,
      lazyload: false,
      pangu: false,
      comments: {
        style: "tabs",
        active: null,
        storage: true,
        lazyload: false,
        nav: null
      },
      algolia: {
        hits: {
          per_page: 10
        },
        labels: {
          input_placeholder: "Search for Posts",
          hits_empty: "We didn't find any results for the search: ${query}",
          hits_stats: "${hits} results found in ${time} ms"
        }
      },
      localsearch: {
        enable: false,
        trigger: "auto",
        top_n_per_article: 1,
        unescape: false,
        preload: false
      },
      motion: {
        enable: true,
        async: false,
        transition: {
          post_block: "fadeIn",
          post_header: "slideDownIn",
          post_body: "slideDownIn",
          coll_header: "slideLeftIn",
          sidebar: "slideUpIn"
        }
      }
    };
  </script>

  <meta name="description" content="正式进入深度强化学习阶段，本篇介绍DQN的两个版本">
<meta property="og:type" content="article">
<meta property="og:title" content="深度强化学习2">
<meta property="og:url" content="http://example.com/2022/03/01/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/7-%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A02/index.html">
<meta property="og:site_name" content="祖浩の博客">
<meta property="og:description" content="正式进入深度强化学习阶段，本篇介绍DQN的两个版本">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="http://example.com/2022/03/01/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/7-%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A02/image-20220301113203718.png">
<meta property="og:image" content="http://example.com/2022/03/01/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/7-%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A02/image-20220301163513836.png">
<meta property="og:image" content="http://example.com/2022/03/01/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/7-%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A02/image-20220301162118639.png">
<meta property="og:image" content="http://example.com/2022/03/01/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/7-%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A02/image-20220301163813030.png">
<meta property="article:published_time" content="2022-03-01T02:27:21.000Z">
<meta property="article:modified_time" content="2022-03-01T08:45:27.260Z">
<meta property="article:author" content="谢祖浩">
<meta property="article:tag" content="强化学习">
<meta property="article:tag" content="深度强化学习">
<meta property="article:tag" content="DQN">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="http://example.com/2022/03/01/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/7-%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A02/image-20220301113203718.png">

<link rel="canonical" href="http://example.com/2022/03/01/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/7-%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A02/">


<script id="page-configurations">
  // Per-page values attached to the shared CONFIG object.
  // Hexo variable reference: https://hexo.io/docs/variables.html
  CONFIG.page = { sidebar: '', isHome: false, isPost: true, lang: 'zh-CN' };
</script>

  <title>深度强化学习2 | 祖浩の博客</title>
  






  <noscript>
  <style>
  /* Applied only when JavaScript is disabled (this sheet lives in <noscript>).
     Resets opacity/offsets to `initial`; presumably main.css keeps these
     elements hidden or shifted until the motion scripts animate them in. */
  .sidebar-inner,
  .use-motion .brand,
  .use-motion .collection-header,
  .use-motion .comments,
  .use-motion .menu-item,
  .use-motion .pagination,
  .use-motion .post-block,
  .use-motion .post-body,
  .use-motion .post-header,
  .use-motion .site-subtitle,
  .use-motion .site-title {
    opacity: initial;
  }

  .use-motion .site-subtitle,
  .use-motion .site-title {
    top: initial;
  }

  .use-motion .logo-line-before i {
    left: initial;
  }

  .use-motion .logo-line-after i {
    right: initial;
  }
  </style>
</noscript>

</head>

<body itemscope itemtype="http://schema.org/WebPage">
  <div class="container use-motion">
    <div class="headband"></div>

    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="切换导航栏">
      <span class="toggle-line toggle-line-first"></span>
      <span class="toggle-line toggle-line-middle"></span>
      <span class="toggle-line toggle-line-last"></span>
    </div>
  </div>

  <div class="site-meta">

    <a href="/" class="brand" rel="start">
      <span class="logo-line-before"><i></i></span>
      <h1 class="site-title">祖浩の博客</h1>
      <span class="logo-line-after"><i></i></span>
    </a>
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger">
    </div>
  </div>
</div>




<nav class="site-nav">
  <ul id="menu" class="main-menu menu">
        <li class="menu-item menu-item-home">

    <a href="/" rel="section"><i class="fa fa-home fa-fw"></i>首页</a>

  </li>
        <li class="menu-item menu-item-tags">

    <a href="/tags/" rel="section"><i class="fa fa-tags fa-fw"></i>标签</a>

  </li>
        <li class="menu-item menu-item-categories">

    <a href="/categories/" rel="section"><i class="fa fa-th fa-fw"></i>分类</a>

  </li>
        <li class="menu-item menu-item-archives">

    <a href="/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>归档</a>

  </li>
  </ul>
</nav>




</div>
    </header>

    
  <div class="back-to-top">
    <i class="fa fa-arrow-up"></i>
    <span>0%</span>
  </div>


    <main class="main">
      <div class="main-inner">
        <div class="content-wrap">
          

          <div class="content post posts-expand">
            

    
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-block" lang="zh-CN">
    <link itemprop="mainEntityOfPage" href="http://example.com/2022/03/01/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/7-%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A02/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/head.jpeg">
      <meta itemprop="name" content="谢祖浩">
      <meta itemprop="description" content="驽马十驾，功在不舍">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="祖浩の博客">
    </span>
      <header class="post-header">
        <h1 class="post-title" itemprop="name headline">
          深度强化学习2
        </h1>

        <div class="post-meta">
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="far fa-calendar"></i>
              </span>
              <span class="post-meta-item-text">发表于</span>
              

              <time title="创建时间：2022-03-01 10:27:21 / 修改时间：16:45:27" itemprop="dateCreated datePublished" datetime="2022-03-01T10:27:21+08:00">2022-03-01</time>
            </span>
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="far fa-folder"></i>
              </span>
              <span class="post-meta-item-text">分类于</span>
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/categories/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/" itemprop="url" rel="index"><span itemprop="name">强化学习</span></a>
                </span>
            </span>

          
            <div class="post-description">正式进入深度强化学习阶段，本篇介绍DQN的两个版本</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">

      
        <h1 id="deep-q-network">Deep Q-Network</h1>
<p>在上篇中所学的<span class="math inline">\(Q\text{-learning}\)</span>，使用如下公式来更新“状态-动作”价值：</p>
<p><span class="math display">\[
Q\left(S_{t}, A_{t}\right) \leftarrow Q\left(S_{t}, A_{t}\right)+\alpha\left[R_{t+1}+\gamma \max _{a} Q\left(S_{t+1}, a\right)-Q\left(S_{t}, A_{t}\right)\right]
\]</span></p>
<p>更新后的“状态-动作”价值存放在一个表格中，这个表格叫做"Q-table"。当状态、动作是有限个时，Q-table的大小是有限的。但如果状态、动作是高维的或连续的，Q-table将会急剧扩大，难以储存。</p>
<h2 id="值函数近似-value-function-approximation">值函数近似 Value Function Approximation</h2>
<p>为了解决连续或高维问题，提出值函数近似；即不使用Q-table来储存各种“状态-动作”的价值，而是用一个函数来近似表示其值，当需要使用时再计算：</p>
<p><span class="math display">\[
Q(s, a) \approx f(s, a, \theta)
\]</span></p>
<p>用于近似的函数有很多种，如：线性模型、决策树、最近邻，而DQN用的是神经网络，输入当前状态和动作后，神经网络输出"状态-动作"的值。</p>
<h2 id="dqn">DQN</h2>
<p>DQN使用的是深度学习方法，训练一个神经网络，首先需要的是训练集，而训练集中的每个样本都需要以“实例+标签”的形式出现。在深度学习中，标签相当于“真值”，如手写的图片6，其标签就是6。但对于一个强化学习算法来说，一开始"状态-动作"的值是无法求取的，那么如何获得标签呢？</p>
<p>在<span class="math inline">\(Q\text{-learning}\)</span>中，每次更新的目标值是<span class="math inline">\(R_{t+1}+\gamma \max_{a} Q\left(S_{t+1},a\right)\)</span>，DQN借用了<span class="math inline">\(Q\text{-learning}\)</span>的概念，以此为标签。</p>
<p>同时，DQN的损失函数就是最普通的均方差：</p>
<p><span class="math display">\[
L(\theta)=\mathbb{E}\left[\left({\color{red} r+\gamma \max _{a^{\prime}} Q\left(s^{\prime}, a^{\prime}, \theta \right)}-Q(s, a, \theta)\right)^{2}\right]
\]</span></p>
<p>但想直接使用神经网络来训练仍然面临以下问题：</p>
<ol type="1">
<li>神经网络训练需要大量样本</li>
<li>深度学习的样本假设是独立同分布的，但强化学习的样本是强相关的</li>
</ol>
<p>DQN为此采用了经验回放机制（<span class="math inline">\(Experience~Replay\)</span>），把智能体与环境交互得到的样本储存在回放缓存（<span class="math inline">\(Replay~Buffer\)</span>）中，训练时再从其中随机采样。这样既提高了数据的利用率，又降低了样本之间的相关性。</p>
<p>具体算法如下：</p>
<figure>
<img src="image-20220301113203718.png" alt="DQN算法流程" /><figcaption>DQN算法流程</figcaption>
</figure>
<p>无论是<span class="math inline">\(Q\text{-learning}\)</span>还是DQN，其对“状态-动作”值的更新都需要使用到下一个“状态-动作”的值（即：<span class="math inline">\(\max _a Q(S_{t+1},a)\)</span>部分）。</p>
<p>在<span class="math inline">\(Q\text{-learning}\)</span>中，因为Q-table的存在，下一个“状态-动作”的值先被随机初始化，而后不断根据公式更新并存储。</p>
<p>在DQN中，没有Q-table的存在，所以下一个“状态-动作”的值第一次是随机初始化的，之后都是根据网络实时计算所得。所以可以说，DQN中，标签是一直在变化的。</p>
<p><img src="image-20220301163513836.png" alt="image-20220301163513836" style="zoom:50%;" /></p>
<h3 id="目标网络-target-network">目标网络 Target Network</h3>
<p>在2015年，DQN的改进版本发表在《Nature》上。主要的改动是加入了目标网络（<span class="math inline">\(Target~Network\)</span>），具体从损失函数说起：</p>
<p><span class="math display">\[
L_{i}\left(\theta_{i}\right)=\mathbb{E}_{s, a, s^{\prime}, r \sim D}(\underbrace{r+\gamma \max _{a^{\prime}} Q\left(s^{\prime}, a^{\prime} ; {\color{red}\theta_{i}^{-}}\right)}_{\text {target }}-Q\left(s, a ; \theta_{i}\right))^{2}
\]</span></p>
<p>对比先前的损失函数，唯一的改动就是目标中“状态-动作”值由另一个参数<span class="math inline">\(\theta_i^-\)</span>确定。</p>
<p>实际上，更新DQN使用了两个神经网络，它们结构相同，但内部参数不同，一个是<span class="math inline">\(\theta_i\)</span>，另一个是<span class="math inline">\(\theta_i^-\)</span>。在每次梯度下降时，不更新<span class="math inline">\(\theta_i^-\)</span>，只更新<span class="math inline">\(\theta_i\)</span>。规定每运行C步后让<span class="math inline">\(\theta_i^-=\theta_i\)</span>，<span class="math inline">\(\theta_i^-\)</span>所对应的网络被称为<span class="math inline">\(target~network\)</span>。</p>
<p>对应于公式中，可以理解为：在C步以内产生标签的网络是不变的，或者说目标不变。设置目标网络的目的是让训练目标在一段时间内保持固定，从而减小训练过程中的振荡与发散、提高训练的稳定性，具体过程如下：</p>
<figure>
<img src="image-20220301162118639.png" alt="加入目标网络后的DQN算法流程" /><figcaption>加入目标网络后的DQN算法流程</figcaption>
</figure>
<p>可见与先前的DQN相比几乎没有变化，只是多了对目标网络的操作。</p>
<p><img src="image-20220301163813030.png" alt="image-20220301163813030" style="zoom:50%;" /></p>

    </div>

    
    
    

      <footer class="post-footer">
          <div class="post-tags">
              <a href="/tags/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/" rel="tag"># 强化学习</a>
              <a href="/tags/%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/" rel="tag"># 深度强化学习</a>
              <a href="/tags/DQN/" rel="tag"># DQN</a>
          </div>

        


        
    <div class="post-nav">
      <div class="post-nav-item">
    <a href="/2022/02/28/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/6-%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A01/" rel="prev" title="6-深度强化学习1">
      <i class="fa fa-chevron-left"></i> 6-深度强化学习1
    </a></div>
      <div class="post-nav-item"></div>
    </div>
      </footer>
    
  </article>
  
  
  



          </div>
          

<script>
  // Comment-tab preference.
  //
  // On `tabs:register`: click the tab whose href is `#comment-<name>`, where
  // <name> is the value stored under the localStorage key `comments_active`
  // (only consulted when CONFIG.comments.storage is truthy) or, failing that,
  // CONFIG.comments.activeClass.
  // On `tabs:click` (only wired up when storage is enabled): remember the
  // clicked comment pane by saving its second class name under that same key.
  window.addEventListener('tabs:register', () => {
    // NOTE(review): CONFIG.comments as emitted on this page has no
    // `activeClass` key, so this starts out undefined here; presumably the
    // theme adds it when a default comment system is configured. Confirm.
    let { activeClass } = CONFIG.comments;
    if (CONFIG.comments.storage) {
      try {
        activeClass = localStorage.getItem('comments_active') || activeClass;
      } catch (err) {
        // Accessing localStorage throws when the browser blocks storage;
        // fall back to the configured default instead of aborting the handler.
      }
    }
    if (activeClass) {
      const activeTab = document.querySelector(`a[href="#comment-${activeClass}"]`);
      if (activeTab) {
        activeTab.click();
      }
    }
  });
  if (CONFIG.comments.storage) {
    window.addEventListener('tabs:click', event => {
      // Ignore tab clicks that do not come from a comment tab pane.
      if (!event.target.matches('.tabs-comment .tab-content .tab-pane')) return;
      // The pane's second class is the value saved as the preference.
      const commentClass = event.target.classList[1];
      try {
        localStorage.setItem('comments_active', commentClass);
      } catch (err) {
        // Storage blocked or full: the preference is simply not persisted.
      }
    });
  }
</script>

        </div>
          
  
  <div class="toggle sidebar-toggle">
    <span class="toggle-line toggle-line-first"></span>
    <span class="toggle-line toggle-line-middle"></span>
    <span class="toggle-line toggle-line-last"></span>
  </div>

  <aside class="sidebar">
    <div class="sidebar-inner">

      <ul class="sidebar-nav motion-element">
        <li class="sidebar-nav-toc">
          文章目录
        </li>
        <li class="sidebar-nav-overview">
          站点概览
        </li>
      </ul>

      <!--noindex-->
      <div class="post-toc-wrap sidebar-panel">
          <div class="post-toc motion-element"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#deep-q-network"><span class="nav-number">1.</span> <span class="nav-text">Deep Q-Network</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#%E5%80%BC%E5%87%BD%E6%95%B0%E8%BF%91%E4%BC%BC-value-function-approximation"><span class="nav-number">1.1.</span> <span class="nav-text">值函数近似 Value Function Approximation</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#dqn"><span class="nav-number">1.2.</span> <span class="nav-text">DQN</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#%E7%9B%AE%E6%A0%87%E7%BD%91%E7%BB%9C-target-network"><span class="nav-number">1.2.1.</span> <span class="nav-text">目标网络 Target Network</span></a></li></ol></li></ol></li></ol></div>
      </div>
      <!--/noindex-->

      <div class="site-overview-wrap sidebar-panel">
        <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
    <img class="site-author-image" itemprop="image" alt="谢祖浩"
      src="/images/head.jpeg">
  <p class="site-author-name" itemprop="name">谢祖浩</p>
  <div class="site-description" itemprop="description">驽马十驾，功在不舍</div>
</div>
<div class="site-state-wrap motion-element">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
          <a href="/archives/">
        
          <span class="site-state-item-count">9</span>
          <span class="site-state-item-name">日志</span>
        </a>
      </div>
      <div class="site-state-item site-state-categories">
            <a href="/categories/">
          
        <span class="site-state-item-count">2</span>
        <span class="site-state-item-name">分类</span></a>
      </div>
      <div class="site-state-item site-state-tags">
            <a href="/tags/">
          
        <span class="site-state-item-count">13</span>
        <span class="site-state-item-name">标签</span></a>
      </div>
  </nav>
</div>



      </div>

    </div>
  </aside>
  <div id="sidebar-dimmer"></div>


      </div>
    </main>

    <footer class="footer">
      <div class="footer-inner">
        

        

<div class="copyright">
  
  &copy; 
  <span itemprop="copyrightYear">2022</span>
  <span class="with-love">
    <i class="fa fa-heart"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">谢祖浩</span>
</div>
  <div class="powered-by">由 <a href="https://hexo.io/" class="theme-link" rel="noopener" target="_blank">Hexo</a> & <a href="https://pisces.theme-next.org/" class="theme-link" rel="noopener" target="_blank">NexT.Pisces</a> 强力驱动
  </div>

        








      </div>
    </footer>
  </div>

  
  <script src="/lib/anime.min.js"></script>
  <script src="/lib/velocity/velocity.min.js"></script>
  <script src="/lib/velocity/velocity.ui.min.js"></script>

<script src="/js/utils.js"></script>

<script src="/js/motion.js"></script>


<script src="/js/schemes/pisces.js"></script>


<script src="/js/next-boot.js"></script>




  















  

  
      

<script>
  // MathJax bootstrap.
  // First run (no global MathJax yet): publish the configuration object on
  // window.MathJax and inject the MathJax 3 bundle from the CDN.
  // Later runs (MathJax already defined): reset its document/TeX state and
  // typeset the page again.
  if (typeof MathJax === 'undefined') {
    window.MathJax = {
      loader: {
        // Map the differently-cased amsCd/AMScd component paths onto '[tex]/amscd'.
        source: {
          '[tex]/amsCd': '[tex]/amscd',
          '[tex]/AMScd': '[tex]/amscd'
        }
      },
      tex: {
        // Adds `$...$` as an inline-math delimiter pair.
        inlineMath: {'[+]': [['$', '$']]},
        tags: 'ams'
      },
      options: {
        renderActions: {
          // Collect <script type="math/tex..."> nodes as math items: each script
          // is replaced by an empty text node that serves as the item's start
          // and end position, and the item is pushed onto doc.math.
          findScript: [10, doc => {
            document.querySelectorAll('script[type^="math/tex"]').forEach(node => {
              const display = !!node.type.match(/; *mode=display/);
              const math = new doc.options.MathItem(node.textContent, doc.inputJax[0], display);
              const text = document.createTextNode('');
              node.parentNode.replaceChild(text, node);
              math.start = {node: text, delim: '', n: 0};
              math.end = {node: text, delim: '', n: 0};
              doc.math.push(math);
            });
          }, '', false],
          // For every rendered container sitting directly inside an <li>, add the
          // `has-jax` class to that <li>'s parent element.
          insertedScript: [200, () => {
            document.querySelectorAll('mjx-container').forEach(node => {
              let target = node.parentNode;
              if (target.nodeName.toLowerCase() === 'li') {
                target.parentNode.classList.add('has-jax');
              }
            });
          }, '', false]
        }
      }
    };
    (function () {
      var script = document.createElement('script');
      // Explicit https: a protocol-relative URL resolves to file:// when the
      // page is opened from disk and to plain HTTP on an HTTP page.
      script.src = 'https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js';
      script.defer = true;
      document.head.appendChild(script);
    })();
  } else {
    MathJax.startup.document.state(0);
    MathJax.texReset();
    MathJax.typeset();
  }
</script>

    

  

</body>
</html>
